Bivariate Analysis between GDP per capita, Sanitation and Life Expectancy across Nations in 2010
Aman Das [BS2206]
Raj Pratap Singh [BS2219]
Shreyansh Mukhopadhyay [BS2147]
Our Aim is to determine which features more directly affect the Life Expectation of the citizens.
\(~\)
This presentation demonstrates the capabilities of Bivariate Analysis on datasets, to infer relationship between various features of Nations.
script.dir <- getSrcDirectory(function(x) {x})
setwd(script.dir)
numerise = function(x){
x[grepl("k$", x)] <- as.numeric(sub("k$", "", x[grepl("k$", x)]))*10^3
x <- as.numeric(x)
return(x)
}
d1_raw = read.csv(file.path(".","Data","gdp.csv"), fileEncoding = 'UTF-8-BOM')
d2_raw = read.csv(file.path(".","Data","sanitation.csv"), fileEncoding = 'UTF-8-BOM')
d3_raw = read.csv(file.path(".","Data","life_expectancy.csv"), fileEncoding = 'UTF-8-BOM')
yearname = "X2010"
d1 = d1_raw[!is.na(numerise(d1_raw[, yearname])),][,c("country", yearname)]
colnames(d1)[2] = "lngdp"
d2 = d2_raw[!is.na(numerise(d2_raw[, yearname])),][,c("country", yearname)]
colnames(d2)[2] = "snt"
d3 = d3_raw[!is.na(numerise(d3_raw[, yearname])),][,c("country", yearname)]
colnames(d3)[2] = "lfx"
dtemp = merge(x = d1, y = d2, by = "country")
d = merge(x = dtemp, y = d3, by = "country")
d$lngdp = log(numerise(d$lngdp))
write.csv(d, "./Data/assembled.csv")
kable(head(d, 6L))| country | lngdp | snt | lfx |
|---|---|---|---|
| Afghanistan | 6.265301 | 34.9 | 60.5 |
| Albania | 8.183118 | 95.2 | 78.1 |
| Algeria | 8.273847 | 87.0 | 74.5 |
| Andorra | 10.454495 | 100.0 | 81.8 |
| Angola | 8.291547 | 41.1 | 60.2 |
| Antigua and Barbuda | 9.546813 | 86.3 | 75.9 |
Mean or Arithmetic Mean \(\bar{x}\), Geometric Mean \(\operatorname{GM}(x)\), Harmonic Mean \(\operatorname{HM}(x)\), Median \(\operatorname{median}(x)\) and Mode \(\operatorname{mode}(x)\) are some measures of central tendency in the sample.
\[ \begin{aligned} \operatorname{mean}(x)=\bar{x} = \frac{1}{n} \sum _{i=1}^{n}(x_{i}) && \operatorname{GM}(x) = \sqrt[n]{\prod_{i=1}^n} x_i \end{aligned} \] \[ \operatorname{HM}(x)= n\sum_{i=1}^n x_i^{-1} \] \[ \begin{aligned} \operatorname{median}(x)=\begin{cases} x_{(n + 1)/ 2} &: n = 1 \mod{2} \\ \frac{x_{(n/2)} + x_{((n/2)+1)}}{2} &:n = 0 \mod{2} \end{cases} && \operatorname{mode}(x) = x_{(n)} \end{aligned} \]
Note: \(x_i\) is the ith observation. \(x_{(i)}\) is the ith largest observation.
getmode <- function(v) {
uniqv <- unique(v)
freq = max(tabulate(match(v, uniqv)))
res = uniqv[which.max(tabulate(match(v, uniqv)))]
if (freq == 1) res = NULL
return(res)
}
d_central = data.frame(
row.names = "Variable",
Variable = c(
"*ln(GDP)*",
"*Sanitation*",
"*Life Exp.*"
),
Mean = c(
mean(d$lngdp),
mean(d$snt),
mean(d$lfx)
),
GM = c(
geometric.mean(d$lngdp),
geometric.mean(d$snt),
geometric.mean(d$lfx)
),
HM = c(
harmonic.mean(d$lngdp),
harmonic.mean(d$snt),
harmonic.mean(d$lfx)
),
Median = c(
median(d$lngdp),
median(d$snt),
median(d$lfx)
),
Mode = c(
getmode(d$lngdp),
getmode(d$snt),
getmode(d$lfx)
)
)
kable(
d_central,
col.names = c(
"$\\bar{x}$",
"$\\operatorname{GM}(x)$",
"$\\operatorname{HM}(x)$",
"$\\operatorname{median}(x)$",
"$\\operatorname{mode}(x)$"
),
digits=5
)| \(\bar{x}\) | \(\operatorname{GM}(x)\) | \(\operatorname{HM}(x)\) | \(\operatorname{median}(x)\) | \(\operatorname{mode}(x)\) | |
|---|---|---|---|---|---|
| ln(GDP) | 8.54124 | 8.42229 | 8.30248 | 8.48673 | 9.23014 |
| Sanitation | 72.43857 | 62.58904 | 47.61862 | 85.60000 | 100.00000 |
| Life Exp. | 70.54603 | 69.95538 | 69.28316 | 72.40000 | 73.20000 |
Range \(\operatorname{range}(x)\), Semi-Interquartile Range \(\operatorname{SIR}(x)\), Mean Deviation about x’ \(\operatorname{MD}_{(x')}(x)\), Variance \(s_x^2\), Standard Deviation \(s_x\) are some measures of dispersion in the sample.
\[ \begin{aligned} \operatorname{Range}(x)=|x_{(n)} - x_{(1)}| && \ Q_1 = \operatorname{median}(x_{(1)}, \ldots ,x_{(\lfloor \frac{n+1}{2} \rfloor)}) && \end{aligned} \] \[ \begin{aligned} \ Q_3 = \operatorname{median}(x_{(\lfloor \frac{n+2}{2} \rfloor)}, \ldots , x_{(n)}) && \operatorname{MD}_{(x')}(x) = \operatorname{mean}(|x_i-x'|) \end{aligned} \] \[ \begin{aligned} \operatorname{SIR}(x)=\frac{|Q_1-Q_3|}{2} && s_x = \sqrt{\operatorname{mean}([x_i - \bar{x}]^2)} && s^2_x= (s_x)^2 \end{aligned} \]
getmd = function(x, center = mean(x)){
md = mean(
abs(
x - rep(center, length(x))
)
)
return(md)
}
d_disp = data.frame(
row.names = "Variable",
Variable = c(
"*ln(GDP)*",
"*Sanitation*",
"*Life Exp.*"
),
Range = c(
max(d$lngdp) - min(d$lngdp),
max(d$snt) - min(d$snt),
max(d$lfx) - min(d$lfx)
),
SIR = c(
IQR(d$lngdp)/2,
IQR(d$snt)/2,
IQR(d$lfx)/2
),
MD = c(
getmd(d$lngdp),
getmd(d$snt),
getmd(d$lfx)
),
variance = c(
(sd(d$lngdp))^2,
(sd(d$snt))^2,
(sd(d$lfx))^2
),
SD = c(
sd(d$lngdp),
sd(d$snt),
sd(d$lfx)
)
)
kable(
d_disp,
col.names = c(
"$\\operatorname{Range}(x)$",
"$\\operatorname{SIR}(x)$",
"$\\operatorname{MD}_{(\\bar{x})}(x)$",
"$\\quad \\quad \\quad \\quad s_x^2$",
"$\\quad \\quad \\quad \\quad s_x$"
),
digits=5
)| \(\operatorname{Range}(x)\) | \(\operatorname{SIR}(x)\) | \(\operatorname{MD}_{(\bar{x})}(x)\) | \(\quad \quad \quad \quad s_x^2\) | \(\quad \quad \quad \quad s_x\) | |
|---|---|---|---|---|---|
| ln(GDP) | 6.04435 | 1.06914 | 1.17229 | 2.01791 | 1.42053 |
| Sanitation | 94.03000 | 24.65000 | 25.50487 | 872.29346 | 29.53461 |
| Life Exp. | 50.80000 | 6.00000 | 6.98712 | 75.33494 | 8.67957 |
Coefficients of Skewness \(g_1\) and Kurtosis \(g_2\) describe the symmetry and extremity of tails of the sample distribution.
\[ \begin{aligned} m_k = \operatorname{mean}([x-\bar{x}]^k) && g_1 = \frac{m_3}{{m_2}^{\frac{3}{2}}} && g_2 = \frac{m_4}{{m_2}^2} \end{aligned} \]
d_shape = data.frame(
row.names = "Variable",
Variable = c(
"*ln(GDP)*",
"*Sanitation*",
"*Life Exp.*"
),
Skewness = c(
skewness(d$lngdp),
skewness(d$snt),
skewness(d$lfx)
),
Kurtosis = c(
kurtosis(d$lngdp),
kurtosis(d$snt),
kurtosis(d$lfx)
)
)
kable(
d_shape,
col.names = c(
"$g_1$",
"$g_2$"
),
digits=5
)| \(g_1\) | \(g_2\) | |
|---|---|---|
| ln(GDP) | 0.09619 | 2.14435 |
| Sanitation | -0.85989 | 2.27111 |
| Life Exp. | -0.87903 | 4.02465 |
Density plots help us in estimating the form, location and spread of the distribution.
labelfunction = function(val1){
return(list(c(
"log of GDP per capita",
"Sanitation Access %",
"Life Expectancy"
)))
}
ggplot(stack(d[2:4]), mapping = aes(x = values))+
geom_density(aes(color=ind), linewidth=rel(1.5))+
labs(
x=NULL,
y=NULL
)+
mytheme+
mycolor+
facet_wrap(~ind, scales="free", labeller = labelfunction, ncol=1)+
theme(legend.position="none",
strip.text.x = element_text(size = rel(1.5))
)Box plots help us detect potential outliers. They also help us in estimating location and skewness of the distribution.
labelfunction = function(val1){
return(list(c(
"log of GDP per capita",
"Sanitation Access %",
"Life Expectancy"
)))
}
ggplot(stack(d[2:4]), mapping = aes(y = values))+
geom_boxplot(aes(fill=ind), alpha=0.6)+
labs(
x=NULL,
y=NULL
)+
mytheme+
mycolor+
facet_wrap(~ind, scales="free", labeller = labelfunction)+
theme(axis.text.x=element_blank(),
legend.position="none",
strip.text.x = element_text(size = rel(1.5)),
panel.grid.minor.x = element_blank(),
panel.grid.major.x = element_blank()
)A Scatter plot helps us estimate the type of relationship between variables.
seems like Linear correlation
Covariance \(\operatorname{cov}(x, y)\) is a measure of the joint variability of two random variables \(x\), \(y\).
Correlation is any relationship, causal or spurious, between two random variables \(x\), \(y\). Pearson’s correlation coefficient \(r\) estimates the linear correlation.
\[ \begin{aligned} \operatorname {cov} (x,y)={\frac {\sum _{i=1}^{n}(x_{i}-\bar{x})(y_{i}-\bar{y})}{n}} && r_{x,y}= \frac{\operatorname{cov}(x,y)}{s_x s_x} \end{aligned} \]
| lngdp | snt | lfx | |
|---|---|---|---|
| lngdp | 2.01791 | 33.84045 | 9.52202 |
| snt | 33.84045 | 872.29346 | 208.54155 |
| lfx | 9.52202 | 208.54155 | 75.33494 |
\[A_{i,j} = \operatorname{cov}(x_i, x_j)\]
Spearman’s \(r_s\), and Kendall’s \(\tau\) are some other correlation coefficients besides Pearson’s \(r\)
d_cor = data.frame(
row.names = "Variable",
Variable = c(
"*Sanitation vs. ln(GDP)*",
"*Life Exp. vs. ln(GDP)*",
"*Life Exp. vs. Sanitation*"
),
Pearson = c(
cor(d$snt, d$lngdp, method="pearson"),
cor(d$lfx, d$lngdp, method="pearson"),
cor(d$lfx, d$snt, method="pearson")
),
Spearman = c(
cor(d$snt, d$lngdp, method="spearman"),
cor(d$lfx, d$lngdp, method="spearman"),
cor(d$lfx, d$snt, method="spearman")
),
Kendall = c(
cor(d$snt, d$lngdp, method="kendall"),
cor(d$lfx, d$lngdp, method="kendall"),
cor(d$lfx, d$snt, method="kendall")
)
)
kable(
d_cor,
digit = 5,
col.names = c(
"*Pearson's* $r$",
"*Spearman's* $r_s$",
"*Kendall's* $\\tau$"
)
)| Pearson’s \(r\) | Spearman’s \(r_s\) | Kendall’s \(\tau\) | |
|---|---|---|---|
| Sanitation vs. ln(GDP) | 0.80659 | 0.85920 | 0.67458 |
| Life Exp. vs. ln(GDP) | 0.77229 | 0.81639 | 0.62168 |
| Life Exp. vs. Sanitation | 0.81351 | 0.83513 | 0.63744 |
Good linear correlation lets try to observe line of best fit.
Simple Univariate Linear Regression is a method for estimating the relationship \(y_i=f(x_i)\) of a response variable \(y\) with a predictor variable \(x\), as a line that closely fits the \(y\) vs. \(x\) scatter plot.
\[ y_i = \hat{a} + \hat{b} x_i + e_i \]
Where \(\hat{a}\) is the intercept, \(\hat{b}\) is the slope, and \(e_i\) is the ith residual error. We aim to minimize \(e_i\) for better fit.
Ordinary Least squares method reduces \(e_i\) by minimizing error sum of squares \(\sum{{e_i}^2}\).
Coefficient of Determination \(R^2\) is the proportion of the variation in \(y\) predictable by the model.
\[ \begin{aligned} \hat{b} = r\frac{s_y}{s_x} && \hat{a} = \bar{y} - \hat{b}\bar{x} && R^2 = 1 - \frac{\sum{{e_i}^2}}{\sum{(y-\bar{y}})^2} \end{aligned} \]
olssmry = function(
d, x_map, y_map,
x_lab=waiver(), y_lab=waiver(),
title=waiver()
){
model = lm(formula=y_map~x_map)
smry = summary(model, signif.stars=TRUE)
smryvec = c(
as.numeric(model$coefficients["(Intercept)"]),
as.numeric(model$coefficients["x_map"]),
smry$r.squared
)
return(smryvec)
}
olstab = t(data.frame(
SvG = olssmry(d, d$lngdp, d$snt),
LvG = olssmry(d, d$lngdp, d$lfx),
LvS = olssmry(d, d$snt, d$lfx)
))
row.names(olstab) = c(
"*Sanitation vs. ln(GDP)*",
"*Life Exp. vs. ln(GDP)*",
"*Life Exp. vs. Sanitation*"
)
kable(
olstab,
digit = 5,
col.names=c(
"$\\hat{a}$",
"$\\hat{b}$",
"$R^2$"
)
)| \(\hat{a}\) | \(\hat{b}\) | \(R^2\) | |
|---|---|---|---|
| Sanitation vs. ln(GDP) | -70.79844 | 16.77006 | 0.65059 |
| Life Exp. vs. ln(GDP) | 30.24203 | 4.71876 | 0.59643 |
| Life Exp. vs. Sanitation | 53.22795 | 0.23907 | 0.66180 |
Least absolute Deviation method reduces \(e_i\) by minimizing the sum of absolute deviations \(\sum{|e_i|}\).
ladsmry = function(
d, x_map, y_map,
x_lab=waiver(), y_lab=waiver(),
title=waiver()
){
model = rq(formula=y_map~x_map)
smry = summary(model)
smryvec = c(
as.numeric(model$coefficients[1]),
as.numeric(model$coefficients[2])
)
return(smryvec)
}
olstab = t(data.frame(
SvG = ladsmry(d, d$lngdp, d$snt),
LvG = ladsmry(d, d$lngdp, d$lfx),
LvS = ladsmry(d, d$snt, d$lfx)
))
row.names(olstab) = c(
"*Sanitation vs. ln(GDP)*",
"*Life Exp. vs. ln(GDP)*",
"*Life Exp. vs. Sanitation*"
)
kable(
olstab,
digit = 5,
col.names=c(
"$\\hat{a}$",
"$\\hat{b}$"
)
)| \(\hat{a}\) | \(\hat{b}\) | |
|---|---|---|
| Sanitation vs. ln(GDP) | -71.23153 | 16.80472 |
| Life Exp. vs. ln(GDP) | 31.99047 | 4.61340 |
| Life Exp. vs. Sanitation | 53.73041 | 0.23963 |
Plotting the estimated Linear Models on the Scatter Plots.
linearplot = function(
d, x_map, y_map,
x_lab=waiver(), y_lab=waiver(),
title=waiver()
){
olsvec = round(olssmry(d, x_map, y_map), digit=5)
ladvec = round(ladsmry(d, x_map, y_map), digit=5)
capstr = TeX(paste("$y_i =", olsvec[1], "+", olsvec[2], "~x_i + e_i$", "\t\t",
"$y_i' =", ladvec[1], "+", ladvec[2], "~x_i + e_i'$"))
plot1 = ggplot(d, mapping = aes(x = x_map, y = y_map))+
geom_point(
alpha=0.6
)+
mytheme+
labs(
x=x_lab,
y=y_lab,
title=title,
caption=capstr,
parse=TRUE
)+
geom_smooth(
method="lm",
formula=y~x,
se=FALSE,
aes(color = "Ordinary Least Squares")
)+
geom_smooth(
method="rq",
formula=y~x,
se=FALSE,
aes(color = "Least Absolute Deviation")
)+
labs(
color="Linear Model"
)+
mycolor+
theme(
plot.caption = element_text(hjust=0.5, color="#504945")
)
return(plot1)
}Partial Correlation is the relationship between two variables \(x\), \(y\) of interest, after removing effect of some other related variable \(z\).
\[ \begin{aligned} &x_i = \hat{a_x} + \hat{b_x} z_i + e_{x,i} && y_i = \hat{a_y} + \hat{b_y} z_i + e_{y,i} \\ &\Rightarrow r_{x,y;z} = r_{e_{x}, e_{y}} \end{aligned} \]
| Partial Correlation | |
|---|---|
| Sanitation vs. ln(GDP) | 0.4826925 |
| Life Exp. vs. ln(GDP) | 0.3377892 |
| Life Exp. vs. Sanitation | 0.5075384 |